# -*- coding: utf8 -*-
import requests, json, re, time,threading,MySQLdb

# Browser-like User-Agent so the target site serves normal pages instead of blocking the bot.
header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0'}
# Landing URL of the news list (newsCategoryId=9).
# NOTE(review): get_allpage() below crawls category 6 — confirm which category is intended.
url = "http://www.huitongranqi.com/news_list/newsCategoryId=9&FrontNews_list01-1374743535367_pageNo=1&FrontNews_list01-1374743535367_pageSize=20.html"
# MySQL connection opened at import time; empty root password — local/dev use only.
db = MySQLdb.connect(host='localhost',user='root',passwd='',db='huitongranqi',charset='utf8')
cursor = db.cursor()
# Destination category id (catid) for rows inserted into v9_news.
n_catid = 12
# Insert skeleton article row into v9_news; parameters: title, inputtime, updatetime.
sql = "insert into v9_news (`catid`, `typeid`, `title`, `style`, `thumb`, `keywords`, `description`, `posids`, `url`, `listorder`, `status`, `sysadd`, `islink`, `username`, `inputtime`, `updatetime`) values("+str(n_catid)+",0,%s,'','','','',0,'',0,99,1,0,'admin',%s,%s)"
# Insert article body into v9_news_data, keyed by the v9_news row id; parameters: id, content.
c_sql = "INSERT INTO `v9_news_data`(`id`, `content`, `readpoint`, `groupids_view`, `paginationtype`, `maxcharperpage`, `template`, `paytype`, `relation`, `voteid`, `allow_comment`, `copyfrom`)VALUES (%s,%s,0,'',0,10000,'',0,'',0,1,'|0')"

def get_allpage(category_id=6, pages=4):  # Build all list-page URLs to crawl
    """Build the paginated list-page URLs for one news category.

    Generalized from the original hard-coded values: category 6, pages 1-4.
    Defaults preserve the original behavior exactly.

    Args:
        category_id: value for the newsCategoryId query parameter (default 6).
        pages: number of list pages to generate, numbered 1..pages (default 4).

    Returns:
        List of absolute list-page URL strings.
    """
    base = ('http://www.huitongranqi.com/news_list/newsCategoryId=%d'
            '&FrontNews_list01-1374743535367_pageNo=%d'
            '&FrontNews_list01-1374743535367_pageSize=20.html')
    return [base % (category_id, page_no) for page_no in range(1, pages + 1)]

def get_image(*img_url):
    """Download each image (relative site path) into the local mirror folder.

    Runs on a worker thread (see get_detail). Fixes two defects in the
    original: a path with no extractable basename used to `return None`,
    silently skipping ALL remaining images instead of just that one; and
    the output file was never closed if the write raised.

    Args:
        *img_url: relative image paths scraped from article HTML.
    """
    for img in img_url:
        match = re.search(r'^.*/(.*)$', img)
        if not match:
            continue  # no basename in this path — skip it, keep downloading the rest
        img_name = match.group(1)
        img_t = requests.get('http://www.huitongranqi.com/'+img)
        # `with` guarantees the handle is closed even if the write fails.
        with open('E:\python\python_test\pachong\huitong\\'+img_name, 'wb') as f:
            f.write(img_t.content)
        print(u'图片下载完成')

def get_detail(news_url):
    """Fetch each article page, parse title/time/body, and insert into the DB.

    Args:
        news_url: iterable of relative article URLs scraped from list pages.

    Returns:
        Number of articles inserted.

    Raises:
        AttributeError: if a page does not match the expected HTML layout
            (the re.search calls return None and .group fails).
    """
    num = 0
    for new_url in news_url:
        new_url = 'http://www.huitongranqi.com/' + new_url
        new_detail = requests.get(new_url, headers=header)
        g_title = re.search(r'<div id="newsdetailshow" >.*?<h2>(.*?)</h2>', new_detail.text, re.S).group(1)

        # Normalize a Chinese date such as u'2015年07月01日 12:30' to
        # '2015-07-01 12:30', then convert to a unix timestamp.
        # (The original called str.replace and discarded the result — strings
        # are immutable — then redundantly redid the work with compiled regexes.)
        g_time = re.search(r'<span class="date"><em>.*?</em>(.*?)\n</span>', new_detail.text, re.S).group(1)
        g_time = g_time.replace(u'年', '-').replace(u'月', '-').replace(u'日', '')
        g_time = int(time.mktime(time.strptime(g_time, '%Y-%m-%d %H:%M')))

        g_content = re.search(r'id="infoContent">(.*?)<div class="operate">', new_detail.text, re.S).group(1)
        g_content = ('<div>' + g_content).strip()

        # Download any embedded images on a background thread so parsing
        # is not blocked by image I/O.
        img_url = re.findall(r'src="(.*?)"', g_content, re.S)
        if img_url:
            t = threading.Thread(target=get_image, args=img_url)
            t.start()

        num = num + 1
        # Insert the skeleton row first, then the body keyed by its new id.
        cursor.execute(sql, (g_title, g_time, g_time))
        cursor.execute(c_sql, (int(cursor.lastrowid), g_content))
    return num


def get_all_url():
    """Crawl every list page, extract the article links, and hand each
    batch to get_detail(); prints the total number of articles crawled."""
    total = 0
    for page_url in get_allpage():
        listing = requests.get(page_url, headers=header)
        article_links = re.findall(r'<li class="title">.*?<h3>.*?<a href="(.*?)"', listing.text, re.S)
        total += get_detail(article_links)
    print(u'共爬取的新闻条数：' + str(total))


# Run the crawl, then persist and release the DB resources.
# Fixes: the original assigned get_all_url()'s None return to the module-level
# `url` (a misleading dead assignment), and issued db.commit() only AFTER
# closing the cursor — commit before closing so the inserts are persisted
# while the resources are still in a known-good state.
get_all_url()
db.commit()
cursor.close()
db.close()
